PM 566 Lab 5

Author

Erica Shin

Setup in R

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(dtplyr)
library(data.table)
Warning: package 'data.table' was built under R version 4.4.1

Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':

    between, first, last
# met data: download once (if not already cached locally), then read with
# data.table's fast reader.
if (!file.exists("met_all.gz")) {
  # NOTE: download.file() has no `timeout` argument -- extra arguments are
  # silently ignored -- so the original `timeout = 60` did nothing. The
  # download timeout is controlled via options(timeout = ...); raise it
  # (never lower it) before downloading large files.
  options(timeout = max(60, getOption("timeout")))
  download.file(
    url      = "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz",
    destfile = "met_all.gz",
    method   = "libcurl"
  )
}
met <- data.table::fread("met_all.gz")

# Download the NOAA ISD station-history metadata (one row per station)
stations <- fread("https://noaa-isd-pds.s3.amazonaws.com/isd-history.csv")
# USAF is read as character because some identifiers contain letters;
# coercing turns those non-numeric IDs into NA, which is why this line
# emits "NAs introduced by coercion". Those NAs are dropped below.
stations[, USAF := as.integer(USAF)]
Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
#stations <- as.data.frame(stations)

# Recode sentinel values as missing: USAF code 999999 and empty-string
# CTRY/STATE all mean "not available". (Rows where the condition is NA are
# skipped by data.table's subset-assign, so existing NAs are untouched.)
stations[USAF == 999999, USAF := NA_integer_]
stations[CTRY == "", CTRY := NA_character_]
stations[STATE == "", STATE := NA_character_]

# Keep only the columns needed for the merge, de-duplicated
stations <- unique(stations[, .(USAF, CTRY, STATE)])

# A station must have a usable USAF identifier
stations <- stations[!is.na(USAF)]

# If a USAF code still maps to several records, keep the first occurrence
stations <- unique(stations, by = "USAF")

#stations <- as.data.frame(stations)

# Merging: left join of the station metadata onto the met observations,
# matching met$USAFID against stations$USAF. all.x = TRUE keeps every met
# row (unmatched rows get NA CTRY/STATE); all.y = FALSE drops stations
# that have no met observations.
dat <- merge(met, stations,
             by.x = "USAFID", by.y = "USAF",
             all.x = TRUE, all.y = FALSE)

head(dat[, c('USAFID', 'WBAN', 'STATE')], n = 4)
Key: <USAFID>
   USAFID  WBAN  STATE
    <int> <int> <char>
1: 690150 93121     CA
2: 690150 93121     CA
3: 690150 93121     CA
4: 690150 93121     CA
# Changing the combined dataset from data.table to data.frame so the
# dplyr pipelines below operate on plain data.frame semantics
dat <- as.data.frame(dat)

Question 1: Representative station for the US

# Temperature distribution (quartiles), ignoring missing values
quantile(dat$temp, na.rm=TRUE)
   0%   25%   50%   75%  100% 
-40.0  19.6  23.5  27.8  56.0 
# Wind-speed distribution (quartiles), ignoring missing values
quantile(dat$wind.sp, na.rm=TRUE)
  0%  25%  50%  75% 100% 
 0.0  0.0  2.1  3.6 36.0 
# Atmospheric-pressure distribution (quartiles), ignoring missing values
quantile(dat$atm.press, na.rm=TRUE)
    0%    25%    50%    75%   100% 
 960.5 1011.8 1014.1 1016.4 1059.9 
# National medians of each variable. median() returns the same value as
# quantile(x, probs = 0.5) (the default type-7 quantile) and avoids the
# confusing positional-after-named argument in quantile(x, na.rm=TRUE, 0.5).
median_temp  <- median(dat$temp, na.rm = TRUE)
median_wind  <- median(dat$wind.sp, na.rm = TRUE)
median_press <- median(dat$atm.press, na.rm = TRUE)

# For each variable, take the USAFID of the single observation closest to
# the national median (note: this compares individual records, not
# station-level summaries, so ties go to the first matching row).
closest_temp_ID  <- dat[which.min(abs(dat$temp - median_temp)), "USAFID"]
closest_wind_ID  <- dat[which.min(abs(dat$wind.sp - median_wind)), "USAFID"]
closest_press_ID <- dat[which.min(abs(dat$atm.press - median_press)), "USAFID"]

# Station IDs closest to the national median temperature, wind speed,
# and atmospheric pressure (in that order)
c(closest_temp_ID, closest_wind_ID, closest_press_ID)
[1] 720113 690150 690150

Two out of the three stations coincide. The weather stations are 720113 and 690150.

Question 2: Representative station per state

# Per-state medians of temperature, wind speed, and atmospheric pressure
# (NA-aware; a state where a variable is entirely missing yields NA)
medians_by_state <- dat |>
  group_by(STATE) |>
  summarize(
    median_temp = median(temp, na.rm = TRUE),
    median_wind = median(wind.sp, na.rm = TRUE),
    median_press = median(atm.press, na.rm = TRUE)
  )

medians_by_state
# A tibble: 48 × 4
   STATE median_temp median_wind median_press
   <chr>       <dbl>       <dbl>        <dbl>
 1 AL           25.3         1.5        1015.
 2 AR           25.6         2.1        1014.
 3 AZ           29           3.1        1011.
 4 CA           21.1         2.6        1013.
 5 CO           18.9         2.6        1014.
 6 CT           22.2         2.1        1015.
 7 DE           24.4         2.6        1015.
 8 FL           27           2.6        1015.
 9 GA           26           1.5        1015.
10 IA           21           2.6        1015.
# ℹ 38 more rows
# Representative station per state: for each variable, the station whose
# observation is closest to that state's median.
#
# Helper: ID whose value is closest to `target`. When every value in the
# group is missing, which.min() would return integer(0) and the group would
# silently collapse to zero rows (tripping the summarise() multi-row
# deprecation warning and dropping whole states); return NA_integer_
# instead so every state keeps exactly one row.
closest_id <- function(ids, values, target) {
  gaps <- abs(values - target)
  if (all(is.na(gaps))) {
    return(NA_integer_)
  }
  ids[which.min(gaps)]
}

# Computing the median inline per group yields the same values as looking
# them up in medians_by_state via match().
median_stations <- dat |>
  group_by(STATE) |>
  summarize(
    rep_temp  = closest_id(USAFID, temp,      median(temp, na.rm = TRUE)),
    rep_wind  = closest_id(USAFID, wind.sp,   median(wind.sp, na.rm = TRUE)),
    rep_press = closest_id(USAFID, atm.press, median(atm.press, na.rm = TRUE)),
    .groups = "drop"
  )
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.
`summarise()` has grouped output by 'STATE'. You can override using the
`.groups` argument.
# Show the per-state representative stations
median_stations
# A tibble: 46 × 4
# Groups:   STATE [46]
   STATE rep_temp rep_wind rep_press
   <chr>    <int>    <int>     <int>
 1 AL      720265   720265    720361
 2 AR      720175   720172    720175
 3 AZ      720339   720339    722720
 4 CA      690150   690150    690150
 5 CO      720528   720385    722817
 6 CT      725027   720545    725027
 7 DE      724088   724088    724093
 8 FL      720373   720373    720383
 9 GA      720257   720257    722070
10 IA      720293   720293    725420
# ℹ 36 more rows
# National medians in a fixed order: temperature, wind speed, pressure
median_list <- c(median_temp, median_wind, median_press)

# Euclidean distance from an observation to a reference point in
# (temperature, wind, pressure) space. All arguments are vectorized.
# `center` defaults to the national medians, so existing three-argument
# calls behave exactly as before; passing a different length-3 vector
# generalizes the function to any reference point.
euclid_dist <- function(temp, wind, pressure, center = median_list) {
  sqrt((temp - center[1])^2 +
       (wind - center[2])^2 +
       (pressure - center[3])^2)
}

# euclid_dist() is built from vectorized arithmetic, so rowwise() is
# unnecessary: one vectorized call replaces one function call per row.
dat1 <- dat |>
  mutate(distance = euclid_dist(temp, wind.sp, atm.press))

# Representative station per state: the observation with the smallest
# distance to the national medians; among equally distant rows, the lowest
# latitude wins (arrange + slice(1) per group). Rows with NA distance
# (any variable missing) can never satisfy distance == min(...), so
# dropping them up front gives identical results while avoiding the
# "no non-missing arguments to min" warning for all-NA states.
representative_per_state <- dat1 |>
  filter(!is.na(distance)) |>
  group_by(STATE) |>
  filter(distance == min(distance)) |>
  arrange(lat) |>
  slice(1)
Warning: There were 2 warnings in `filter()`.
The first warning was:
ℹ In argument: `distance == min(distance, na.rm = TRUE)`.
ℹ In group 26: `STATE = "ND"`.
Caused by warning in `min()`:
! no non-missing arguments to min; returning Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Show the representative station chosen for each state
print(representative_per_state)
# A tibble: 46 × 33
# Groups:   STATE [46]
   USAFID  WBAN  year month   day  hour   min   lat    lon  elev wind.dir
    <int> <int> <int> <int> <int> <int> <int> <dbl>  <dbl> <int>    <int>
 1 722269  3850  2019     8    10    10    56  31.3  -85.7    92      260
 2 723407  3953  2019     8    19     2    53  35.8  -90.6    84      250
 3 723740 23194  2019     8     4    10    56  35.0 -111.   1488       20
 4 722975 53141  2019     8    18    18    56  33.8 -118.     11      200
 5 724767 93069  2019     8    27    15    53  37.3 -109.   1803      140
 6 725084 54767  2019     8     4     2    52  41.7  -72.2    75      190
 7 724088 13707  2019     8     4     6    56  39.1  -75.5     9      210
 8 722246  3844  2019     8    11     9    56  30.6  -86.5    59      280
 9 722270 13864  2019     8    14     8    56  33.9  -84.5   326      210
10 725420 14931  2019     8    16     0    53  40.8  -91.1   214      100
# ℹ 36 more rows
# ℹ 22 more variables: wind.dir.qc <chr>, wind.type.code <chr>, wind.sp <dbl>,
#   wind.sp.qc <chr>, ceiling.ht <int>, ceiling.ht.qc <int>,
#   ceiling.ht.method <chr>, sky.cond <chr>, vis.dist <int>, vis.dist.qc <chr>,
#   vis.var <chr>, vis.var.qc <chr>, temp <dbl>, temp.qc <chr>,
#   dew.point <dbl>, dew.point.qc <chr>, atm.press <dbl>, atm.press.qc <int>,
#   rh <dbl>, CTRY <chr>, STATE <chr>, distance <dbl>

Question 3: In the middle?

library(leaflet)

# Approximate geographic midpoint (centroid) of each of the 50 US states.
# NOTE: STATE here holds full state names, while dat$STATE uses two-letter
# abbreviations -- the two tables are linked below through the computed
# USAFID, never by state name. Coordinates are decimal degrees
# (Latitude north-positive, Longitude west-negative); values are
# commonly cited centroids -- TODO confirm against an authoritative source.
midpoints <- data.frame(
  STATE = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", 
            "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", 
            "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", 
            "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", 
            "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", 
            "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", 
            "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", 
            "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"),
  Latitude = c(32.8060, 61.3704, 34.1682, 34.9697, 36.7783, 39.5501, 41.5978, 39.3185,
               27.7663, 33.0406, 21.0943, 44.2405, 40.6331, 39.7663, 42.0046, 39.0119,
               37.6681, 31.1695, 45.3676, 39.0639, 42.2302, 43.3266, 46.3920, 32.7416,
               37.9643, 46.9219, 41.4925, 38.5020, 43.1939, 40.2989, 34.8405, 43.2994,
               35.6309, 47.5369, 40.4173, 35.5653, 43.8041, 40.5908, 41.5801, 33.8569,
               44.2998, 35.7478, 31.9686, 39.3200, 44.0459, 37.4316, 47.7511, 38.5976,
               43.7844, 43.0759),
  Longitude = c(-86.7911, -152.4044, -111.9300, -92.3731, -119.4179, -105.7821, -72.7554,
                -75.5071, -81.6868, -83.6431, -157.4983, -114.4788, -89.3985, -86.4413,
                -93.2113, -98.4842, -84.6701, -91.8671, -68.9722, -76.8021, -71.5301,
                -84.5361, -94.6360, -89.6787, -91.8318, -110.4544, -99.9018, -116.3009,
                -71.5724, -74.5210, -106.2485, -74.2179, -79.8064, -99.7930, -82.9071,
                -96.9289, -120.5542, -77.2098, -71.4774, -80.9450, -99.4389, -86.6923,
                -99.9018, -111.0937, -72.7107, -78.6569, -120.7401, -80.4549, -88.7879,
                -107.2903)
)

# Show the midpoint table
midpoints
            STATE Latitude Longitude
1         Alabama  32.8060  -86.7911
2          Alaska  61.3704 -152.4044
3         Arizona  34.1682 -111.9300
4        Arkansas  34.9697  -92.3731
5      California  36.7783 -119.4179
6        Colorado  39.5501 -105.7821
7     Connecticut  41.5978  -72.7554
8        Delaware  39.3185  -75.5071
9         Florida  27.7663  -81.6868
10        Georgia  33.0406  -83.6431
11         Hawaii  21.0943 -157.4983
12          Idaho  44.2405 -114.4788
13       Illinois  40.6331  -89.3985
14        Indiana  39.7663  -86.4413
15           Iowa  42.0046  -93.2113
16         Kansas  39.0119  -98.4842
17       Kentucky  37.6681  -84.6701
18      Louisiana  31.1695  -91.8671
19          Maine  45.3676  -68.9722
20       Maryland  39.0639  -76.8021
21  Massachusetts  42.2302  -71.5301
22       Michigan  43.3266  -84.5361
23      Minnesota  46.3920  -94.6360
24    Mississippi  32.7416  -89.6787
25       Missouri  37.9643  -91.8318
26        Montana  46.9219 -110.4544
27       Nebraska  41.4925  -99.9018
28         Nevada  38.5020 -116.3009
29  New Hampshire  43.1939  -71.5724
30     New Jersey  40.2989  -74.5210
31     New Mexico  34.8405 -106.2485
32       New York  43.2994  -74.2179
33 North Carolina  35.6309  -79.8064
34   North Dakota  47.5369  -99.7930
35           Ohio  40.4173  -82.9071
36       Oklahoma  35.5653  -96.9289
37         Oregon  43.8041 -120.5542
38   Pennsylvania  40.5908  -77.2098
39   Rhode Island  41.5801  -71.4774
40 South Carolina  33.8569  -80.9450
41   South Dakota  44.2998  -99.4389
42      Tennessee  35.7478  -86.6923
43          Texas  31.9686  -99.9018
44           Utah  39.3200 -111.0937
45        Vermont  44.0459  -72.7107
46       Virginia  37.4316  -78.6569
47     Washington  47.7511 -120.7401
48  West Virginia  38.5976  -80.4549
49      Wisconsin  43.7844  -88.7879
50        Wyoming  43.0759 -107.2903
# For each state midpoint, find the station (USAFID) minimizing the planar
# distance in (lat, lon) degrees. rowwise() is needed here: each midpoint
# is compared against the full dat$lat/dat$lon vectors.
closest_stations <- midpoints |>
  rowwise() |>
  mutate(
    USAFID = dat$USAFID[which.min(sqrt((dat$lat - Latitude)^2 + (dat$lon - Longitude)^2))]
  ) |>
  ungroup()

# Attach each chosen station's coordinates. dat holds many observations per
# station, so de-duplicate to one coordinate row per USAFID first --
# joining against the raw observations fans out to one row per observation
# and the map would draw thousands of overlapping red markers.
combined <- closest_stations |>
  left_join(
    dat |> select(USAFID, lat, lon) |> distinct(USAFID, .keep_all = TRUE),
    by = "USAFID"
  )

# Base map
map <- leaflet() |>
  addTiles()

# Red markers: the station closest to each state's midpoint
map <- map |>
  addCircleMarkers(data = combined,
                   ~lon, ~lat,
                   color = "red",  # Color for closest station midpoints
                   radius = 5,
                   popup = ~paste("Station ID:", USAFID))

# Blue markers: the state midpoints themselves
map <- map |>
  addCircleMarkers(data = midpoints,
                   ~Longitude, ~Latitude,
                   color = "blue",  # Color for state midpoints
                   radius = 7,
                   label = ~paste("State Midpoint:", STATE),
                   group = "State Midpoints")
map

Question 4: Means of means

# Per-state means of temperature, wind speed, and atmospheric pressure,
# with each state classified by its mean temperature:
#   < 20 = "Low", [20, 25) = "Mid", >= 25 = "High"; NA stays NA.
average_temps <- dat |>
  group_by(STATE) |>
  summarize(
    avg_temp = mean(temp, na.rm = TRUE),
    avg_wind = mean(wind.sp, na.rm = TRUE),
    avg_press = mean(atm.press, na.rm = TRUE),
    .groups = 'drop'
  ) |>
  mutate(
    temp_level = case_when(
      avg_temp >= 25 ~ "High",
      avg_temp >= 20 ~ "Mid",
      avg_temp < 20 ~ "Low"
    )
  )

# Summary table by temperature level. Tag every observation with its
# state's temp_level, then aggregate the observations within each level.
# (Referencing dat$temp etc. inside a grouped summarize() bypasses the
# grouping entirely and repeats the overall totals on every row -- all
# three levels would show identical counts and means.)
summary_table <- dat |>
  left_join(average_temps |> select(STATE, temp_level), by = "STATE") |>
  group_by(temp_level) |>
  summarize(
    num_entries = n(),                        # observations in this level
    num_na_entries = sum(is.na(temp)),        # missing temperature records
    num_stations = n_distinct(USAFID),        # distinct stations
    num_states = n_distinct(STATE),           # distinct states
    mean_temp = mean(temp, na.rm = TRUE),
    mean_wind = mean(wind.sp, na.rm = TRUE),
    mean_press = mean(atm.press, na.rm = TRUE),
    .groups = 'drop'
  )

# Display the summary table
print(summary_table)
# A tibble: 3 × 8
  temp_level num_entries num_na_entries num_stations num_states mean_temp
  <chr>            <int>          <int>        <int>      <int>     <dbl>
1 High                12          60089         1595         48      23.6
2 Low                 11          60089         1595         48      23.6
3 Mid                 25          60089         1595         48      23.6
# ℹ 2 more variables: mean_wind <dbl>, mean_press <dbl>